"""Lightcast job-postings cleaning pipeline.

Loads the raw CSV with Spark, casts numeric columns, imputes missing
salaries with the approximate median, and exports a column subset to
./output/cleaned_subset.csv for downstream analysis.
"""
import os, sys

# Force PySpark workers/driver onto the current interpreter and drop any
# system Spark install so the pip-bundled Spark distribution is used.
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable
os.environ.pop("SPARK_HOME", None)
os.environ.pop("SPARK_DIST_CLASSPATH", None)
os.makedirs("./output", exist_ok=True)

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, monotonically_increasing_id
from pyspark.sql import functions as F
import pandas as pd
import numpy as np
import plotly.io as pio

np.random.seed(42)
pio.renderers.default = "notebook"

# Initialize Spark Session
spark = SparkSession.builder.appName("LightcastData").getOrCreate()

# Load Data. multiLine + escape are required because free-text fields in
# this dataset contain embedded newlines and escaped quotes.
df = (
    spark.read.option("header", "true")
    .option("inferSchema", "true")
    .option("multiLine", "true")
    .option("escape", "\"")
    .csv("./data/lightcast_job_postings.csv")
)
df.createOrReplaceTempView("job_postings")

# Show Schema and Sample Data
# print("---This is Diagnostic check, No need to print it in the final doc---")
df.printSchema()  # comment this line when rendering the submission
# df.show(5)

# Cast numeric columns explicitly; inferSchema can leave them as strings
# when the column mixes blanks with numbers.
df = df.withColumn("SALARY", col("SALARY").cast("float")) \
       .withColumn("MAX_YEARS_EXPERIENCE", col("MAX_YEARS_EXPERIENCE").cast("float"))


def compute_median(sdf, col_name):
    """Return the approximate median of *col_name* in Spark DataFrame *sdf*.

    Uses ``approxQuantile`` with 1% relative error, which ignores nulls.
    Returns ``None`` when the column holds no non-null values (in that
    case ``approxQuantile`` yields an empty list).
    """
    q = sdf.approxQuantile(col_name, [0.5], 0.01)
    return q[0] if q else None


median_salary = compute_median(df, "SALARY")
print("Median SALARY:", median_salary)

# BUG FIX: fillna with a None replacement value raises a ValueError, so
# only impute when a median was actually computed.
if median_salary is not None:
    df = df.fillna({"SALARY": median_salary})
df = df.withColumn("Average_Salary", col("SALARY"))

# Column subset exported for downstream analysis/visualization.
export_cols = [
    "EDUCATION_LEVELS_NAME",
    "REMOTE_TYPE_NAME",
    "MAX_YEARS_EXPERIENCE",
    "Average_Salary",
    "SALARY",
    "LOT_V6_SPECIALIZED_OCCUPATION_NAME",
    "EMPLOYMENT_TYPE_NAME",
]
df_selected = df.select(*export_cols)

# Collect to the driver and persist; fine here because the subset is small.
pdf = df_selected.toPandas()
pdf.to_csv("./output/cleaned_subset.csv", index=False)
print("Data Cleaning Complete. Rows Retained:", len(pdf))
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/24 18:59:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Stage 0:> (0 + 1) / 1] [Stage 1:> (0 + 1) / 1]